suppressMessages(library(readxl))
suppressMessages(library(plotly))
suppressMessages(library(randomForest))
suppressMessages(library(dplyr))
suppressMessages(library(GGally))
suppressMessages(library(ggplot2))
suppressMessages(library(scatterplot3d))
suppressMessages(library(plm))
########################################################
## Clean Data
##
## Delete the data which height and weight = 'NA' (Invalid BMI data)
########################################################
dat_ex_all <- read_excel("./SharedFiles/ST606/2020/data/Exercise/fit_database_anthropometric_all.xlsx", sheet=1, na='NA')
# Clean data (delete all data that the follow conditions are NA)
dat <- subset(dat_ex_all, `height (cm)` != 'NA'
& `weight (kg)` != 'NA')
# Change the date to year
dat$mYear <- substr(as.character(dat$`measurement date`), start=1, stop=4)
# Change the colnames
colnames(dat)[10] <- 'z_category'
colnames(dat)[9] <- 'z_score'
colnames(dat)[2] <- 'm_date'
colnames(dat)[3] <- 'age'
colnames(dat)[4] <- 'age_bin'
colnames(dat)[6] <- 'height'
colnames(dat)[7] <- 'weight'
dataSta <- data.frame(all = nrow(dat_ex_all), obj = nrow(dat), del = nrow(dat_ex_all) - nrow(dat))
# new age bin by 0.5 year
dat$new_age_bin = as.factor(ifelse((dat$age - dat$age_bin)>= 0.5,dat$age_bin + 0.5 , dat$age_bin))
# delete error data
dat$indx <- paste(dat$ID, dat$new_age_bin, sep='||')
index <- duplicated(dat[,13])
dat_u <- dat[!index, 1:12]
########################################################
## Show the pie chart
## data: input data
## type: 0:by year 1:by age
## value: condition
## gender: 0:Girl 1:Boy 2:All
########################################################
showPie <- function(data, type, value, gender) {
title <- 'The Healthy Ratio'
# by year
if (type == 0) {
title <- paste(title," in ", value)
# Gender:All
if (gender == 2) {
# all
nums <- c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value ),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value ),]),nrow(dat_uc[which(dat_uc$z_category=="overweight"& dat_uc$mYear==value ),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$mYear==value ),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$mYear==value ),]))
drawPie(nums, title)
}
# Gender:Boy
else if (gender == 1) {
# boy
title1 <- paste(title," (Boy)")
nums <- c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$mYear==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$mYear==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$mYear==value & dat_uc$gender=='boy'),]))
drawPie(nums, title1)
}
# Gender:Girl
else if (gender == 0) {
# girl
title2 <- paste(title," (Girl)")
nums <- c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$mYear==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$mYear==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$mYear==value & dat_uc$gender=='girl'),]))
drawPie(nums, title2)
}
}
# by age bin
else {
title <- paste(title," ", value, " year-old")
# Gender:All
if (gender == 2) {
# all
nums <- c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value ),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value ),]),nrow(dat_uc[which(dat_uc$z_category=="overweight"& dat_uc$age_bin==value ),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$age_bin==value ),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$age_bin==value ),]))
drawPie(nums, title)
}
# Gender:Boy
else if (gender == 1) {
# boy
title1 <- paste(title," (Boy)")
nums <- c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$age_bin==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$age_bin==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$age_bin==value & dat_uc$gender=='boy'),]))
drawPie(nums, title1)
}
# Gender:Girl
else if (gender == 0) {
# girl
title2 <- paste(title," (Girl)")
nums <- c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$age_bin==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$age_bin==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$age_bin==value & dat_uc$gender=='girl'),]))
drawPie(nums, title2)
}
}
}
########################################################
## Show the pie chart
## data: input data
## type: 0:by year 1:by age
## value: condition
## gender: 0:Girl 1:Boy 2:All
########################################################
getRatio <- function(data, type, gender) {
pDataNor <- c()
pDataObe <- c()
pDataOver <- c()
pDataSThin <- c()
pDataThin <- c()
# by year
if (type == 0) {
# Gender:All
if (gender == 2) {
# all
for (value in 2007:2018) {
norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value),])/nrow(dat_uc[which(dat_uc$mYear==value),])
pDataNor <- c(pDataNor, norPer)
norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value ),])/nrow(dat_uc[which(dat_uc$mYear==value),])
pDataObe <- c(pDataObe, norObe)
norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$mYear==value ),])/nrow(dat_uc[which(dat_uc$mYear==value),])
pDataOver <- c(pDataOver, norOver)
norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$mYear==value ),])/nrow(dat_uc[which(dat_uc$mYear==value),])
pDataSThin <- c(pDataSThin, norSThin)
norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$mYear==value ),])/nrow(dat_uc[which(dat_uc$mYear==value),])
pDataThin <- c(pDataThin, norThin)
}
pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
pData
}
# Gender:Boy
else if (gender == 1) {
# boy
for (value in 2007:2018) {
norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value & dat_uc$gender=='boy'),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='boy'),])
pDataNor <- c(pDataNor, norPer)
norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='boy'),])
pDataObe <- c(pDataObe, norObe)
norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$mYear==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='boy'),])
pDataOver <- c(pDataOver, norOver)
norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$mYear==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='boy'),])
pDataSThin <- c(pDataSThin, norSThin)
norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$mYear==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='boy'),])
pDataThin <- c(pDataThin, norThin)
}
pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
pData
}
# Gender:Girl
else if (gender == 0) {
# girl
for (value in 2007:2018) {
norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value & dat_uc$gender=='girl'),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='girl'),])
pDataNor <- c(pDataNor, norPer)
norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='girl'),])
pDataObe <- c(pDataObe, norObe)
norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$mYear==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='girl'),])
pDataOver <- c(pDataOver, norOver)
norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$mYear==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='girl'),])
pDataSThin <- c(pDataSThin, norSThin)
norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$mYear==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='girl'),])
pDataThin <- c(pDataThin, norThin)
}
pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
pData
}
}
# by age bin
else {
# Gender:All
if (gender == 2) {
# all
for (value in 6:18) {
norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value),])/nrow(dat_uc[which(dat_uc$age_bin==value),])
pDataNor <- c(pDataNor, norPer)
norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value ),])/nrow(dat_uc[which(dat_uc$age_bin==value),])
pDataObe <- c(pDataObe, norObe)
norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$age_bin==value ),])/nrow(dat_uc[which(dat_uc$age_bin==value),])
pDataOver <- c(pDataOver, norOver)
norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$age_bin==value ),])/nrow(dat_uc[which(dat_uc$age_bin==value),])
pDataSThin <- c(pDataSThin, norSThin)
norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$age_bin==value ),])/nrow(dat_uc[which(dat_uc$age_bin==value),])
pDataThin <- c(pDataThin, norThin)
}
pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
pData
}
# Gender:Boy
else if (gender == 1) {
# boy
for (value in 6:18) {
norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value & dat_uc$gender=='boy'),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='boy'),])
pDataNor <- c(pDataNor, norPer)
norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='boy'),])
pDataObe <- c(pDataObe, norObe)
norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$age_bin==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='boy'),])
pDataOver <- c(pDataOver, norOver)
norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$age_bin==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='boy'),])
pDataSThin <- c(pDataSThin, norSThin)
norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$age_bin==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='boy'),])
pDataThin <- c(pDataThin, norThin)
}
pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
pData
}
# Gender:Girl
else if (gender == 0) {
# girl
for (value in 6:18) {
norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value & dat_uc$gender=='girl'),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='girl'),])
pDataNor <- c(pDataNor, norPer)
norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='girl'),])
pDataObe <- c(pDataObe, norObe)
norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$age_bin==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='girl'),])
pDataOver <- c(pDataOver, norOver)
norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$age_bin==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='girl'),])
pDataSThin <- c(pDataSThin, norSThin)
norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$age_bin==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='girl'),])
pDataThin <- c(pDataThin, norThin)
}
pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
pData
}
}
}
########################################################
## Draw the pie chart
## nums: count by category
## title: title
########################################################
drawPie <- function(nums, title){
# Create Data
data <- data.frame(
Category=c('normal','obese','overweight','severely thin', 'thin' ),
value=nums
)
# Edit label
label_value <- paste('(', round(data$value/sum(data$value) * 100, 1), '%)', sep = '')
label <- paste(data$Category, label_value, sep = '')
# Basic piechart
ggplot(data, aes(x="", y=value, fill=Category)) +
geom_bar(stat="identity", width=1, color="white") +
theme_void() + coord_polar(theta = 'y') + labs(x = '', y = '', title = '') + theme(axis.text = element_blank()) + theme(axis.ticks = element_blank()) + scale_fill_discrete(labels = label) +
ggtitle(title)
}
########################################################
## Show trend chart
## data: input data
## type: 0:by year 1:by age
## gender: 0:Girl 1:Boy 2:All
########################################################
showTrend <- function(data, ty, gender) {
if (ty == 0) {
title <- 'The trend of Healthy Ratio 2007~2018'
if (gender == 1) {
title <- paste(title, " (Boy)")
} else if (gender == 0) {
title <- paste(title, " (Girl)")
}
Year <- rep(2007:2018, times = 5)
type <- rep(c('normal','obese','overweight','severely thin', 'thin'), each = 12)
Proportion <- getRatio(data, ty, gender)
df <- data.frame(year = Year, type = type, proportion = Proportion)
ggplot(data = df, mapping = aes(x = Year, y = Proportion, colour = type)) + geom_line()+
ggtitle(title)
} else {
title <- 'The trend of Healthy Ratio 6~18 year-old'
if (gender == 1) {
title <- paste(title, " (Boy)")
} else if (gender == 0) {
title <- paste(title, " (Girl)")
}
Age <- rep(6:18, times = 5)
type <- rep(c('normal','obese','overweight','severely thin', 'thin'), each = 13)
Proportion <- getRatio(data, ty, gender)
df <- data.frame(Age = Age, type = type, proportion = Proportion)
ggplot(data = df, mapping = aes(x = Age, y = Proportion, colour = type)) + geom_line()+
ggtitle(title)
}
}
########################################################
## z_category by age
##
## the percentage of different age
########################################################
# ALL
dat_u$z_category = as.factor(dat_u$z_category)
dat_uc <- dat_u
dat_uc$count <- rep(1, times = nrow(dat_uc))
# Show the pie by age
showPie(dat_uc, 1, 6, 2)

showPie(dat_uc, 1, 6, 1)

showPie(dat_uc, 1, 6, 0)

showPie(dat_uc, 1, 7, 2)

showPie(dat_uc, 1, 7, 1)

showPie(dat_uc, 1, 7, 0)

showPie(dat_uc, 1, 8, 2)

showPie(dat_uc, 1, 8, 1)

showPie(dat_uc, 1, 8, 0)

showPie(dat_uc, 1, 9, 2)

showPie(dat_uc, 1, 9, 1)

showPie(dat_uc, 1, 9, 0)

showPie(dat_uc, 1, 10, 2)

showPie(dat_uc, 1, 10, 1)

showPie(dat_uc, 1, 10, 0)

showPie(dat_uc, 1, 11, 2)

showPie(dat_uc, 1, 11, 1)

showPie(dat_uc, 1, 11, 0)

showPie(dat_uc, 1, 12, 2)

showPie(dat_uc, 1, 12, 1)

showPie(dat_uc, 1, 12, 0)

showPie(dat_uc, 1, 13, 2)

showPie(dat_uc, 1, 13, 1)

showPie(dat_uc, 1, 13, 0)

showPie(dat_uc, 1, 14, 2)

showPie(dat_uc, 1, 14, 1)

showPie(dat_uc, 1, 14, 0)

showPie(dat_uc, 1, 15, 2)

showPie(dat_uc, 1, 15, 1)

showPie(dat_uc, 1, 15, 0)

showPie(dat_uc, 1, 16, 2)

showPie(dat_uc, 1, 16, 1)

showPie(dat_uc, 1, 16, 0)

showPie(dat_uc, 1, 17, 2)

showPie(dat_uc, 1, 17, 1)

showPie(dat_uc, 1, 17, 0)

showPie(dat_uc, 1, 18, 2)

showPie(dat_uc, 1, 18, 1)

showPie(dat_uc, 1, 18, 0)

########################################################
## z_category By year
##
## the percentage of different year
########################################################
# Show the pie by year
showPie(dat_uc, 0, '2007', 2)

showPie(dat_uc, 0, '2007', 1)

showPie(dat_uc, 0, '2007', 0)

showPie(dat_uc, 0, '2008', 2)

showPie(dat_uc, 0, '2008', 1)

showPie(dat_uc, 0, '2008', 0)

showPie(dat_uc, 0, '2009', 2)

showPie(dat_uc, 0, '2009', 1)

showPie(dat_uc, 0, '2009', 0)

showPie(dat_uc, 0, '2010', 2)

showPie(dat_uc, 0, '2010', 1)

showPie(dat_uc, 0, '2010', 0)

showPie(dat_uc, 0, '2011', 2)

showPie(dat_uc, 0, '2011', 1)

showPie(dat_uc, 0, '2011', 0)

showPie(dat_uc, 0, '2012', 2)

showPie(dat_uc, 0, '2012', 1)

showPie(dat_uc, 0, '2012', 0)

showPie(dat_uc, 0, '2013', 2)

showPie(dat_uc, 0, '2013', 1)

showPie(dat_uc, 0, '2013', 0)

showPie(dat_uc, 0, '2014', 2)

showPie(dat_uc, 0, '2014', 1)

showPie(dat_uc, 0, '2014', 0)

showPie(dat_uc, 0, '2015', 2)

showPie(dat_uc, 0, '2015', 1)

showPie(dat_uc, 0, '2015', 0)

showPie(dat_uc, 0, '2016', 2)

showPie(dat_uc, 0, '2016', 1)

showPie(dat_uc, 0, '2016', 0)

showPie(dat_uc, 0, '2017', 2)

showPie(dat_uc, 0, '2017', 1)

showPie(dat_uc, 0, '2017', 0)

showPie(dat_uc, 0, '2018', 2)

showPie(dat_uc, 0, '2018', 1)

showPie(dat_uc, 0, '2018', 0)

########################################################
## z_category By Year
##
## the trend
########################################################
# All
showTrend(dat_uc, 0, 2)

# Boy
showTrend(dat_uc, 0, 1)

# Girl
showTrend(dat_uc, 0, 0)

########################################################
## z_category By age
##
## the trend
########################################################
# By Age
# All
showTrend(dat_uc, 1, 2)

# Boy
showTrend(dat_uc, 1, 1)

# Girl
showTrend(dat_uc, 1, 0)

########################################################
## Z-category ~ Height + Weight + age + gender
##
## cluster relationship
########################################################
ggplot(data = dat_uc) + geom_point(mapping = aes(x = weight, y = height, color=z_category))

dat_boy <- dat_uc[which(dat_uc$gender=='boy' ),]
fig_boy <- plot_ly(dat_boy, x=~`weight`, y = ~`height`, z = ~`age`, color = ~`z_category`)
fig_boy <- fig_boy %>% add_markers()
fig_boy
dat_girl <- dat_uc[which(dat_uc$gender=='girl' ),]
fig_girl <- plot_ly(dat_girl, x=~`weight`, y = ~`height`, z = ~`age`, color = ~`z_category`)
fig_girl <- fig_girl %>% add_markers()
fig_girl
########################################################
## Predict Z-category ~ height + weight + age + gender (Machine Learning)
##
## Sample 80% as training data
## Sample 20% as test data
## Use randomForest algorithm to learn the function
## then use the test data to predict the z-category and compare the actual result to chect the funtion
########################################################
dat_learn <- dat_uc
## change gender to number (boy:1, girl:0)
dat_learn$gender[dat_learn$gender == "boy"] <- 1
dat_learn$gender[dat_learn$gender == "girl"] <- 0
dat_learn$gender <- as.numeric(dat_learn$gender)
## measurement date,age (years), age bin have strong Collinearity. So I only remain the age(years)
dat_learn <- cbind(dat_learn[, 3], dat_learn[, 5:7], dat_learn[, 10])
names(dat_learn)[1:4]<-c("age", "gender", "height", "weight")
## sampling 80% for training, 20% for test.
ind <- sample(c(1,2), nrow(dat_learn), replace=T, prob = c(0.8, 0.2))
## training data
trainData <- dat_learn[ind == 1,]
## test data
testData <- dat_learn[ind == 2,]
set.seed(500)
ez.forest <- randomForest(z_category ~., data = trainData,
na.action = na.roughfix,
importance = TRUE)
ez.forest
##
## Call:
## randomForest(formula = z_category ~ ., data = trainData, importance = TRUE, na.action = na.roughfix)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 2.08%
## Confusion matrix:
## normal obese overweight severely thin thin class.error
## normal 51916 0 276 0 60 0.006430376
## obese 0 8862 256 0 0 0.028076333
## overweight 408 211 11869 0 0 0.049567585
## severely thin 3 0 0 71 104 0.601123596
## thin 245 0 0 7 1357 0.156619018
importance(ez.forest,type=2)
## MeanDecreaseGini
## age 6555.2970
## gender 840.4034
## height 8611.3377
## weight 19668.8672
forest.pred <- predict(ez.forest, testData)
forest.perf <- table(testData$z_category, forest.pred,
dnn = c('Actual', 'Predicted'))
forest.perf
## Predicted
## Actual normal obese overweight severely thin thin
## normal 12984 0 67 0 19
## obese 0 2193 67 0 0
## overweight 96 51 3068 0 0
## severely thin 0 0 0 17 26
## thin 61 0 0 1 379
########################################################
## Predict Z-Score ~ age + gender (Long unbalanced panel data)
## Way1: predict the z_score at 18(18.5) year
## Way2: predict the unbalanced z_score
## plm
########################################################
dat_unq <- dat_uc
dat_unq$last_rs = rep(NA, times = nrow(dat_unq))
bId <- 0
z18 <- 0
for (i in 1:nrow(dat_unq)) {
id <- dat_unq[i,1]$ID
if (id == bId) {
if (nrow(z18) != 0) {
dat_unq$last_rs[i] = z18$z_score
}
}
else {
z18 <- dat_unq[which(dat_unq$ID==id & dat_unq$new_age_bin == '18.5'), 'z_score']
if (nrow(z18) == 0) {
z18 <- dat_unq[which(dat_unq$ID==id & dat_unq$new_age_bin == '18'), 'z_score']
}
if (nrow(z18) != 0) {
dat_unq$last_rs[i] = z18$z_score
} else {
dat_unq$last_rs[i] = NA
z18$z_score <- NA
}
bId <- id
}
}
dat_Last <- dat_unq[which(is.na(dat_unq$last_rs) == FALSE ), ]
fitPlm1 <- plm(z_score~ gender + new_age_bin, data=dat_unq, model = 'random', index=c("ID", "new_age_bin"))
summary(fitPlm1)
## Oneway (individual) effect Random Effect Model
## (Swamy-Arora's transformation)
##
## Call:
## plm(formula = z_score ~ gender + new_age_bin, data = dat_unq,
## model = "random", index = c("ID", "new_age_bin"))
##
## Unbalanced Panel: n = 13540, T = 1-22, N = 94674
##
## Effects:
## var std.dev share
## idiosyncratic 0.2061 0.4540 0.123
## individual 1.4669 1.2112 0.877
## theta:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6490 0.8487 0.8823 0.8698 0.9037 0.9203
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.2921 -0.2651 0.0064 -0.0003 0.2752 4.4151
##
## Coefficients:
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept) 0.2753754 0.0356370 7.7272 1.099e-14 ***
## gendergirl -0.0470912 0.0212205 -2.2191 0.0264773 *
## new_age_bin6.5 0.0351477 0.0337905 1.0402 0.2982638
## new_age_bin7 0.0680335 0.0331229 2.0540 0.0399788 *
## new_age_bin7.5 0.0927227 0.0330026 2.8096 0.0049610 **
## new_age_bin8 0.1325888 0.0330846 4.0076 6.135e-05 ***
## new_age_bin8.5 0.1645342 0.0330953 4.9715 6.643e-07 ***
## new_age_bin9 0.1975643 0.0331715 5.9559 2.587e-09 ***
## new_age_bin9.5 0.1943423 0.0332092 5.8521 4.855e-09 ***
## new_age_bin10 0.1947005 0.0332475 5.8561 4.739e-09 ***
## new_age_bin10.5 0.1968568 0.0332809 5.9150 3.319e-09 ***
## new_age_bin11 0.1981328 0.0333163 5.9470 2.731e-09 ***
## new_age_bin11.5 0.1926030 0.0333386 5.7772 7.596e-09 ***
## new_age_bin12 0.1852653 0.0333395 5.5569 2.746e-08 ***
## new_age_bin12.5 0.1891827 0.0333541 5.6719 1.412e-08 ***
## new_age_bin13 0.1727504 0.0333721 5.1765 2.261e-07 ***
## new_age_bin13.5 0.1479326 0.0334039 4.4286 9.485e-06 ***
## new_age_bin14 0.1141124 0.0334420 3.4122 0.0006443 ***
## new_age_bin14.5 0.0626004 0.0334585 1.8710 0.0613467 .
## new_age_bin15 0.0269095 0.0334964 0.8034 0.4217699
## new_age_bin15.5 -0.0069914 0.0335210 -0.2086 0.8347853
## new_age_bin16 -0.0203785 0.0335312 -0.6077 0.5433547
## new_age_bin16.5 -0.0253267 0.0335797 -0.7542 0.4507137
## new_age_bin17 -0.0390515 0.0336446 -1.1607 0.2457615
## new_age_bin17.5 -0.0595620 0.0337190 -1.7664 0.0773254 .
## new_age_bin18 -0.0522465 0.0338587 -1.5431 0.1228129
## new_age_bin18.5 -0.0573948 0.0344423 -1.6664 0.0956333 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 19924
## Residual Sum of Squares: 19504
## R-Squared: 0.021062
## Adj. R-Squared: 0.020793
## Chisq: 2036.32 on 26 DF, p-value: < 2.22e-16
fitPlm2 <- plm(z_score~ gender + new_age_bin, data=dat_unq, model = "within", index=c("ID", "new_age_bin"))
summary(fitPlm2)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = z_score ~ gender + new_age_bin, data = dat_unq,
## model = "within", index = c("ID", "new_age_bin"))
##
## Unbalanced Panel: n = 13540, T = 1-22, N = 94674
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -4.28995 -0.22002 0.00000 0.22726 4.64759
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## new_age_bin6.5 0.037084 0.034047 1.0892 0.2760636
## new_age_bin7 0.070264 0.033417 2.1026 0.0355039 *
## new_age_bin7.5 0.093949 0.033306 2.8207 0.0047926 **
## new_age_bin8 0.133462 0.033395 3.9964 6.436e-05 ***
## new_age_bin8.5 0.164771 0.033409 4.9319 8.158e-07 ***
## new_age_bin9 0.198309 0.033490 5.9213 3.206e-09 ***
## new_age_bin9.5 0.193785 0.033531 5.7793 7.528e-09 ***
## new_age_bin10 0.193976 0.033572 5.7779 7.589e-09 ***
## new_age_bin10.5 0.196426 0.033608 5.8447 5.095e-09 ***
## new_age_bin11 0.198331 0.033645 5.8947 3.767e-09 ***
## new_age_bin11.5 0.192948 0.033670 5.7306 1.004e-08 ***
## new_age_bin12 0.185672 0.033673 5.5140 3.518e-08 ***
## new_age_bin12.5 0.188918 0.033689 5.6076 2.058e-08 ***
## new_age_bin13 0.172058 0.033710 5.1040 3.332e-07 ***
## new_age_bin13.5 0.147179 0.033744 4.3616 1.293e-05 ***
## new_age_bin14 0.113281 0.033786 3.3529 0.0008002 ***
## new_age_bin14.5 0.059502 0.033810 1.7599 0.0784308 .
## new_age_bin15 0.021525 0.033859 0.6357 0.5249447
## new_age_bin15.5 -0.011586 0.033888 -0.3419 0.7324238
## new_age_bin16 -0.023808 0.033900 -0.7023 0.4824844
## new_age_bin16.5 -0.027785 0.033950 -0.8184 0.4131234
## new_age_bin17 -0.040330 0.034017 -1.1856 0.2357924
## new_age_bin17.5 -0.060214 0.034095 -1.7661 0.0773894 .
## new_age_bin18 -0.053063 0.034240 -1.5498 0.1212045
## new_age_bin18.5 -0.060139 0.034841 -1.7261 0.0843290 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 17120
## Residual Sum of Squares: 16720
## R-Squared: 0.023339
## Adj. R-Squared: -0.13999
## F-statistic: 77.5311 on 25 and 81109 DF, p-value: < 2.22e-16
fitPlm3 <- plm(last_rs~ gender + z_score + new_age_bin, data=dat_Last, model = 'pooling', effect = "twoways", index=c("ID", "new_age_bin"))
summary(fitPlm3)
## Pooling Model
##
## Call:
## plm(formula = last_rs ~ gender + z_score + new_age_bin, data = dat_Last,
## effect = "twoways", model = "pooling", index = c("ID", "new_age_bin"))
##
## Unbalanced Panel: n = 3822, T = 1-22, N = 30915
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -4.033535 -0.296469 -0.012373 0.304697 4.639339
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## (Intercept) -0.1154118 0.0679273 -1.6990 0.08932 .
## gendergirl 0.0354422 0.0068619 5.1651 2.418e-07 ***
## z_score 0.8731876 0.0027568 316.7387 < 2.2e-16 ***
## new_age_bin8 -0.0900486 0.0822225 -1.0952 0.27345
## new_age_bin8.5 -0.0849295 0.0766365 -1.1082 0.26778
## new_age_bin9 -0.0779625 0.0747532 -1.0429 0.29699
## new_age_bin9.5 -0.0595303 0.0733562 -0.8115 0.41707
## new_age_bin10 -0.0860052 0.0724358 -1.1873 0.23511
## new_age_bin10.5 -0.0852668 0.0715346 -1.1920 0.23328
## new_age_bin11 -0.1063984 0.0710718 -1.4971 0.13439
## new_age_bin11.5 -0.1137045 0.0707826 -1.6064 0.10820
## new_age_bin12 -0.1191120 0.0704488 -1.6908 0.09089 .
## new_age_bin12.5 -0.1127691 0.0701931 -1.6066 0.10816
## new_age_bin13 -0.0901450 0.0699564 -1.2886 0.19755
## new_age_bin13.5 -0.0745277 0.0699142 -1.0660 0.28644
## new_age_bin14 -0.0474844 0.0698774 -0.6795 0.49680
## new_age_bin14.5 0.0228357 0.0696322 0.3279 0.74295
## new_age_bin15 0.0641466 0.0693658 0.9248 0.35510
## new_age_bin15.5 0.0780343 0.0692040 1.1276 0.25950
## new_age_bin16 0.0896932 0.0690477 1.2990 0.19395
## new_age_bin16.5 0.0987755 0.0689407 1.4328 0.15194
## new_age_bin17 0.1129036 0.0688545 1.6397 0.10107
## new_age_bin17.5 0.1182883 0.0687366 1.7209 0.08528 .
## new_age_bin18 0.1157509 0.0686008 1.6873 0.09155 .
## new_age_bin18.5 0.1207992 0.0689606 1.7517 0.07983 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 47703
## Residual Sum of Squares: 11221
## R-Squared: 0.76477
## Adj. R-Squared: 0.76459
## F-statistic: 4184.55 on 24 and 30890 DF, p-value: < 2.22e-16